{
 "cells": [
  {
   "metadata": {},
   "cell_type": "markdown",
   "source": "## 分组运算",
   "id": "188b5a4ac385197b"
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-01-08T12:42:01.926697Z",
     "start_time": "2025-01-08T12:42:01.921489Z"
    }
   },
   "cell_type": "code",
   "source": [
    "import numpy as np\n",
    "#分组后给名称加前缀\n",
    "dict_obj = {'key1' : ['a', 'b', 'a', 'b',\n",
    "                      'a', 'b', 'a', 'a'],\n",
    "            'key2' : ['one', 'one', 'two', 'three',\n",
    "                      'two', 'two', 'one', 'three'],\n",
    "            'data1': np.random.randint(1, 10, 8),\n",
    "            'data2': np.random.randint(1, 10, 8)}\n",
    "df_obj = pd.DataFrame(dict_obj)\n",
    "print(df_obj)\n",
    "print('-'*50)"
   ],
   "id": "2a39a1fa34e431a7",
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "  key1   key2  data1  data2\n",
      "0    a    one      3      7\n",
      "1    b    one      5      3\n",
      "2    a    two      9      2\n",
      "3    b  three      4      4\n",
      "4    a    two      8      2\n",
      "5    b    two      2      2\n",
      "6    a    one      5      2\n",
      "7    a  three      1      1\n",
      "--------------------------------------------------\n"
     ]
    }
   ],
   "execution_count": 17
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-01-08T12:42:12.942130Z",
     "start_time": "2025-01-08T12:42:12.934994Z"
    }
   },
   "cell_type": "code",
   "source": "df_obj.info()",
   "id": "b5704814329a969e",
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 8 entries, 0 to 7\n",
      "Data columns (total 4 columns):\n",
      " #   Column  Non-Null Count  Dtype \n",
      "---  ------  --------------  ----- \n",
      " 0   key1    8 non-null      object\n",
      " 1   key2    8 non-null      object\n",
      " 2   data1   8 non-null      int32 \n",
      " 3   data2   8 non-null      int32 \n",
      "dtypes: int32(2), object(2)\n",
      "memory usage: 324.0+ bytes\n"
     ]
    }
   ],
   "execution_count": 18
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-01-08T12:42:21.202230Z",
     "start_time": "2025-01-08T12:42:21.196075Z"
    }
   },
   "cell_type": "code",
   "source": [
    "# 按key1分组后，计算data1，data2的统计信息并附加到原始表格中，并添加表头前缀\n",
    "k1_sum = df_obj.groupby('key1').mean(numeric_only=True).add_prefix('mean_')\n",
    "print(k1_sum)"
   ],
   "id": "574c56ca28b43bec",
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "      mean_data1  mean_data2\n",
      "key1                        \n",
      "a       5.200000         2.8\n",
      "b       3.666667         3.0\n"
     ]
    }
   ],
   "execution_count": 19
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-01-08T12:42:28.953489Z",
     "start_time": "2025-01-08T12:42:28.943146Z"
    }
   },
   "cell_type": "code",
   "source": [
    "# 方法2，使用transform，分组后计算结果和原本的df保持一致\n",
    "k1_sum_tf = df_obj.loc[:,['key1','data1', 'data2']].groupby('key1').transform('mean').add_prefix('mean_')\n",
    "k1_sum_tf"
   ],
   "id": "4e4636a1fa8aa1b7",
   "outputs": [
    {
     "data": {
      "text/plain": [
       "   mean_data1  mean_data2\n",
       "0    5.200000         2.8\n",
       "1    3.666667         3.0\n",
       "2    5.200000         2.8\n",
       "3    3.666667         3.0\n",
       "4    5.200000         2.8\n",
       "5    3.666667         3.0\n",
       "6    5.200000         2.8\n",
       "7    5.200000         2.8"
      ],
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>mean_data1</th>\n",
       "      <th>mean_data2</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>5.200000</td>\n",
       "      <td>2.8</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>3.666667</td>\n",
       "      <td>3.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>5.200000</td>\n",
       "      <td>2.8</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>3.666667</td>\n",
       "      <td>3.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5.200000</td>\n",
       "      <td>2.8</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>3.666667</td>\n",
       "      <td>3.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>5.200000</td>\n",
       "      <td>2.8</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>5.200000</td>\n",
       "      <td>2.8</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "execution_count": 20
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-01-08T12:42:55.311428Z",
     "start_time": "2025-01-08T12:42:54.875370Z"
    }
   },
   "cell_type": "code",
   "source": "del df_obj['key2']",
   "id": "75506b6b134f6462",
   "outputs": [
    {
     "ename": "KeyError",
     "evalue": "'key2'",
     "output_type": "error",
     "traceback": [
      "\u001B[1;31m---------------------------------------------------------------------------\u001B[0m",
      "\u001B[1;31mKeyError\u001B[0m                                  Traceback (most recent call last)",
      "File \u001B[1;32m~\\AppData\\Roaming\\Python\\Python312\\site-packages\\pandas\\core\\indexes\\base.py:3805\u001B[0m, in \u001B[0;36mIndex.get_loc\u001B[1;34m(self, key)\u001B[0m\n\u001B[0;32m   3804\u001B[0m \u001B[38;5;28;01mtry\u001B[39;00m:\n\u001B[1;32m-> 3805\u001B[0m     \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[38;5;28;43mself\u001B[39;49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43m_engine\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mget_loc\u001B[49m\u001B[43m(\u001B[49m\u001B[43mcasted_key\u001B[49m\u001B[43m)\u001B[49m\n\u001B[0;32m   3806\u001B[0m \u001B[38;5;28;01mexcept\u001B[39;00m \u001B[38;5;167;01mKeyError\u001B[39;00m \u001B[38;5;28;01mas\u001B[39;00m err:\n",
      "File \u001B[1;32mindex.pyx:167\u001B[0m, in \u001B[0;36mpandas._libs.index.IndexEngine.get_loc\u001B[1;34m()\u001B[0m\n",
      "File \u001B[1;32mindex.pyx:196\u001B[0m, in \u001B[0;36mpandas._libs.index.IndexEngine.get_loc\u001B[1;34m()\u001B[0m\n",
      "File \u001B[1;32mpandas\\\\_libs\\\\hashtable_class_helper.pxi:7081\u001B[0m, in \u001B[0;36mpandas._libs.hashtable.PyObjectHashTable.get_item\u001B[1;34m()\u001B[0m\n",
      "File \u001B[1;32mpandas\\\\_libs\\\\hashtable_class_helper.pxi:7089\u001B[0m, in \u001B[0;36mpandas._libs.hashtable.PyObjectHashTable.get_item\u001B[1;34m()\u001B[0m\n",
      "\u001B[1;31mKeyError\u001B[0m: 'key2'",
      "\nThe above exception was the direct cause of the following exception:\n",
      "\u001B[1;31mKeyError\u001B[0m                                  Traceback (most recent call last)",
      "Cell \u001B[1;32mIn[22], line 1\u001B[0m\n\u001B[1;32m----> 1\u001B[0m \u001B[38;5;28;01mdel\u001B[39;00m \u001B[43mdf_obj\u001B[49m\u001B[43m[\u001B[49m\u001B[38;5;124;43m'\u001B[39;49m\u001B[38;5;124;43mkey2\u001B[39;49m\u001B[38;5;124;43m'\u001B[39;49m\u001B[43m]\u001B[49m\n",
      "File \u001B[1;32m~\\AppData\\Roaming\\Python\\Python312\\site-packages\\pandas\\core\\generic.py:4506\u001B[0m, in \u001B[0;36mNDFrame.__delitem__\u001B[1;34m(self, key)\u001B[0m\n\u001B[0;32m   4501\u001B[0m             deleted \u001B[38;5;241m=\u001B[39m \u001B[38;5;28;01mTrue\u001B[39;00m\n\u001B[0;32m   4502\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m \u001B[38;5;129;01mnot\u001B[39;00m deleted:\n\u001B[0;32m   4503\u001B[0m     \u001B[38;5;66;03m# If the above loop ran and didn't delete anything because\u001B[39;00m\n\u001B[0;32m   4504\u001B[0m     \u001B[38;5;66;03m# there was no match, this call should raise the appropriate\u001B[39;00m\n\u001B[0;32m   4505\u001B[0m     \u001B[38;5;66;03m# exception:\u001B[39;00m\n\u001B[1;32m-> 4506\u001B[0m     loc \u001B[38;5;241m=\u001B[39m \u001B[38;5;28;43mself\u001B[39;49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43maxes\u001B[49m\u001B[43m[\u001B[49m\u001B[38;5;241;43m-\u001B[39;49m\u001B[38;5;241;43m1\u001B[39;49m\u001B[43m]\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mget_loc\u001B[49m\u001B[43m(\u001B[49m\u001B[43mkey\u001B[49m\u001B[43m)\u001B[49m\n\u001B[0;32m   4507\u001B[0m     \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39m_mgr \u001B[38;5;241m=\u001B[39m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39m_mgr\u001B[38;5;241m.\u001B[39midelete(loc)\n\u001B[0;32m   4509\u001B[0m \u001B[38;5;66;03m# delete from the caches\u001B[39;00m\n",
      "File \u001B[1;32m~\\AppData\\Roaming\\Python\\Python312\\site-packages\\pandas\\core\\indexes\\base.py:3812\u001B[0m, in \u001B[0;36mIndex.get_loc\u001B[1;34m(self, key)\u001B[0m\n\u001B[0;32m   3807\u001B[0m     \u001B[38;5;28;01mif\u001B[39;00m \u001B[38;5;28misinstance\u001B[39m(casted_key, \u001B[38;5;28mslice\u001B[39m) \u001B[38;5;129;01mor\u001B[39;00m (\n\u001B[0;32m   3808\u001B[0m         \u001B[38;5;28misinstance\u001B[39m(casted_key, abc\u001B[38;5;241m.\u001B[39mIterable)\n\u001B[0;32m   3809\u001B[0m         \u001B[38;5;129;01mand\u001B[39;00m \u001B[38;5;28many\u001B[39m(\u001B[38;5;28misinstance\u001B[39m(x, \u001B[38;5;28mslice\u001B[39m) \u001B[38;5;28;01mfor\u001B[39;00m x \u001B[38;5;129;01min\u001B[39;00m casted_key)\n\u001B[0;32m   3810\u001B[0m     ):\n\u001B[0;32m   3811\u001B[0m         \u001B[38;5;28;01mraise\u001B[39;00m InvalidIndexError(key)\n\u001B[1;32m-> 3812\u001B[0m     \u001B[38;5;28;01mraise\u001B[39;00m \u001B[38;5;167;01mKeyError\u001B[39;00m(key) \u001B[38;5;28;01mfrom\u001B[39;00m \u001B[38;5;21;01merr\u001B[39;00m\n\u001B[0;32m   3813\u001B[0m \u001B[38;5;28;01mexcept\u001B[39;00m \u001B[38;5;167;01mTypeError\u001B[39;00m:\n\u001B[0;32m   3814\u001B[0m     \u001B[38;5;66;03m# If we have a listlike key, _check_indexing_error will raise\u001B[39;00m\n\u001B[0;32m   3815\u001B[0m     \u001B[38;5;66;03m#  InvalidIndexError. Otherwise we fall through and re-raise\u001B[39;00m\n\u001B[0;32m   3816\u001B[0m     \u001B[38;5;66;03m#  the TypeError.\u001B[39;00m\n\u001B[0;32m   3817\u001B[0m     \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39m_check_indexing_error(key)\n",
      "\u001B[1;31mKeyError\u001B[0m: 'key2'"
     ]
    }
   ],
   "execution_count": 22
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-01-08T12:43:20.774567Z",
     "start_time": "2025-01-08T12:43:20.767668Z"
    }
   },
   "cell_type": "code",
   "source": "df_obj.groupby('key1').transform(np.mean)",
   "id": "bd40c9f324602db1",
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\Users\\MECHREV\\AppData\\Local\\Temp\\ipykernel_34200\\2916770615.py:1: FutureWarning: The provided callable <function mean at 0x000001B0C2EC9F80> is currently using DataFrameGroupBy.mean. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string \"mean\" instead.\n",
      "  df_obj.groupby('key1').transform(np.mean)\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "      data1  data2\n",
       "0  5.200000    2.8\n",
       "1  3.666667    3.0\n",
       "2  5.200000    2.8\n",
       "3  3.666667    3.0\n",
       "4  5.200000    2.8\n",
       "5  3.666667    3.0\n",
       "6  5.200000    2.8\n",
       "7  5.200000    2.8"
      ],
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>data1</th>\n",
       "      <th>data2</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>5.200000</td>\n",
       "      <td>2.8</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>3.666667</td>\n",
       "      <td>3.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>5.200000</td>\n",
       "      <td>2.8</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>3.666667</td>\n",
       "      <td>3.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5.200000</td>\n",
       "      <td>2.8</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>3.666667</td>\n",
       "      <td>3.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>5.200000</td>\n",
       "      <td>2.8</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>5.200000</td>\n",
       "      <td>2.8</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "execution_count": 23
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-01-08T12:43:51.255421Z",
     "start_time": "2025-01-08T12:43:51.248862Z"
    }
   },
   "cell_type": "code",
   "source": "df_obj",
   "id": "e015a399bb905028",
   "outputs": [
    {
     "data": {
      "text/plain": [
       "  key1  data1  data2\n",
       "0    a      3      7\n",
       "1    b      5      3\n",
       "2    a      9      2\n",
       "3    b      4      4\n",
       "4    a      8      2\n",
       "5    b      2      2\n",
       "6    a      5      2\n",
       "7    a      1      1"
      ],
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>key1</th>\n",
       "      <th>data1</th>\n",
       "      <th>data2</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>a</td>\n",
       "      <td>3</td>\n",
       "      <td>7</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>b</td>\n",
       "      <td>5</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>a</td>\n",
       "      <td>9</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>b</td>\n",
       "      <td>4</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>a</td>\n",
       "      <td>8</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>b</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>a</td>\n",
       "      <td>5</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>a</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "execution_count": 24
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-01-08T12:43:58.333317Z",
     "start_time": "2025-01-08T12:43:58.326403Z"
    }
   },
   "cell_type": "code",
   "source": [
    "def diff_mean(s):\n",
    "    \"\"\"\n",
    "        返回数据与均值的差值，s传入的是某一个分组\n",
    "    \"\"\"\n",
    "    return s - s.mean()\n",
    "\n",
    "print(df_obj.groupby('key1').transform(diff_mean))"
   ],
   "id": "c0bcc498ce28d2bd",
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "      data1  data2\n",
      "0 -2.200000    4.2\n",
      "1  1.333333    0.0\n",
      "2  3.800000   -0.8\n",
      "3  0.333333    1.0\n",
      "4  2.800000   -0.8\n",
      "5 -1.666667   -1.0\n",
      "6 -0.200000   -0.8\n",
      "7 -4.200000   -1.8\n"
     ]
    }
   ],
   "execution_count": 25
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
